{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Create a small sample list of companies, from different sectors.\n",
    "2. Collect patent information via PatentsViews' APIs, including patent abstract, classification code, and other sector information."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#create a sample list of companies\n",
    "asg_list = ['Ford motor', 'Toyota', 'Bayerische motoren werke',\n",
    "           'International business machines', 'Google', 'Apple',\n",
    "           'Intel Corporation', 'Huawei', 'Ericsson', 'Qualcomm', 'Nokia Corporation',\n",
    "           'PepsiCo, Inc.', 'The Coca-Cola Company',\n",
    "           'Shell Oil Company']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#use the api supported by PatentsView\n",
    "def getURL(id, tag):\n",
    "    if (tag =='asg'):\n",
    "        url = 'http://www.patentsview.org/api/assignees/query?q={\"_and\":[{\"patent_type\":\"utility\"},{\"_gte\":{\"patent_date\":\"2000-01-01\"}},{\"_lte\":{\"patent_date\":\"2019-12-31\"}},{\"_begins\":{\"assignee_organization\":\"'\n",
    "        url = url + id + '\"}}]}&f=[\"patent_abstract\",\"patent_date\", \"patent_id\"]'\n",
    "    elif (tag == \"cpc\"):\n",
    "        url = 'http://www.patentsview.org/api/patents/query?q={\"patent_number\":\"'\n",
    "        url = url + id + '\"}&f=[\"patent_id\",\"cpc_group_id\",\"wipo_sector_title\",\"nber_subcategory_title\"]'\n",
    "    return url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getDataset(url):\n",
    "    data = requests.get(url).json()\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def parseData(data, asg_name, info_list):\n",
    "    patent_info = data['assignees']\n",
    "    for i in range(len(patent_info)):\n",
    "        patents = patent_info[i]['patents']\n",
    "        for patn_set in patents:\n",
    "            pid = patn_set['patent_id']\n",
    "            if (pid[0].isdigit()):\n",
    "                patn_abstract = patn_set['patent_abstract']\n",
    "                patn_date = patn_set['patent_date']\n",
    "                info_list.append({\"pid\": pid, \"abstract\":patn_abstract, \"date\": patn_date, \"name\": asg_name})\n",
    "    return info_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "#get all the patents information for the companies in the list\n",
    "info_list = []\n",
    "for asg in asg_list:\n",
    "    data = getDataset(getURL(asg, 'asg'))\n",
    "    info_list = parseData(data, asg, info_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "patent_info_df = pd.DataFrame(info_list)\n",
    "patent_info_df.to_csv('company_patent_info.tsv', index=False, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>abstract</th>\n",
       "      <th>date</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7963585</td>\n",
       "      <td>A tonneau system for a vehicle bed includes a ...</td>\n",
       "      <td>2011-06-21</td>\n",
       "      <td>Ford motor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8277157</td>\n",
       "      <td>The present invention relates to a tie-down de...</td>\n",
       "      <td>2012-10-02</td>\n",
       "      <td>Ford motor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8831857</td>\n",
       "      <td>Methods and systems are provided for operating...</td>\n",
       "      <td>2014-09-09</td>\n",
       "      <td>Ford motor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8833341</td>\n",
       "      <td>In one example, a method for controlling a fue...</td>\n",
       "      <td>2014-09-16</td>\n",
       "      <td>Ford motor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9732689</td>\n",
       "      <td>Methods and systems are provided for operating...</td>\n",
       "      <td>2017-08-15</td>\n",
       "      <td>Ford motor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       pid                                           abstract        date  \\\n",
       "0  7963585  A tonneau system for a vehicle bed includes a ...  2011-06-21   \n",
       "1  8277157  The present invention relates to a tie-down de...  2012-10-02   \n",
       "2  8831857  Methods and systems are provided for operating...  2014-09-09   \n",
       "3  8833341  In one example, a method for controlling a fue...  2014-09-16   \n",
       "4  9732689  Methods and systems are provided for operating...  2017-08-15   \n",
       "\n",
       "         name  \n",
       "0  Ford motor  \n",
       "1  Ford motor  \n",
       "2  Ford motor  \n",
       "3  Ford motor  \n",
       "4  Ford motor  "
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "patent_info_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "266368"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#get classification information according to patent IDs\n",
    "patent_list = list(patent_info_df['pid'])\n",
    "len(patent_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To collect classification information via APIs will take a long time, so we just download the dataset supported by Patentsview, and filter specific cagetories for the patent list."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7963585</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8277157</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8831857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8833341</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9732689</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       pid\n",
       "0  7963585\n",
       "1  8277157\n",
       "2  8831857\n",
       "3  8833341\n",
       "4  9732689"
      ]
     },
     "execution_count": 161,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "patent_class_df = pd.DataFrame(columns=['pid'])\n",
    "patent_class_df['pid'] = patent_info_df['pid'].copy()\n",
    "patent_class_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "cpc_df = pd.DataFrame(pd.read_csv('../../cpc_current.tsv', sep='\\t'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "cpc_df['patent_id'] = cpc_df['patent_id'].apply(lambda x: str(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "patent_class_df = patent_class_df.merge(cpc_df, left_on='pid', right_on='patent_id', how='inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "patent_class_df = patent_class_df.drop(columns=['uuid', 'patent_id', 'section_id', 'subsection_id', 'subgroup_id', 'category', 'sequence'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {},
   "outputs": [],
   "source": [
    "patent_class_df = patent_class_df.rename(columns={'group_id': 'cpc'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {},
   "outputs": [],
   "source": [
    "patent_class_df.to_csv('patent_cpc_samples.tsv', index=False, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#preprocess\n",
    "# wipo_df = pd.DataFrame(pd.read_csv('../../wipo.tsv', sep='\\t'))\n",
    "# nber_df = pd.DataFrame(pd.read_csv('../../nber.tsv', sep='\\t'))\n",
    "# wipo_field_df = pd.DataFrame(pd.read_csv('../../wipo_field.tsv', sep='\\t'))\n",
    "# nber_cg_df = pd.DataFrame(pd.read_csv('../../nber_subcategory.tsv', sep='\\t'))\n",
    "# wipo_df['patent_id'] = wipo_df['patent_id'].apply(lambda x: str(x))\n",
    "# patent_class_df = pd.DataFrame(columns=['pid'])\n",
    "# patent_class_df['pid'] = patent_info_df['pid'].copy()\n",
    "# patent_class_df.head()\n",
    "# len(patent_class_df)\n",
    "\n",
    "# patent_class_df = patent_class_df.merge(wipo_df, left_on='pid', right_on='patent_id', how='inner')\n",
    "# patent_class_df['field_id'] = patent_class_df['field_id'].apply(lambda x: str(x))\n",
    "# patent_class_df = patent_class_df.merge(wipo_field_df, left_on='field_id', right_on='id', how='inner')\n",
    "# patent_class_df = patent_class_df.drop(columns=['patent_id', 'sequence', 'id', 'field_id'], axis=1)\n",
    "# patent_class_df.to_csv('patent_wipo_label.tsv', index=False, sep='\\t')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:py3-tf2]",
   "language": "python",
   "name": "conda-env-py3-tf2-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
